/* a_nsc_match.do - ************************************************************

	This matches students in the NSC data to assignment and enrollment data.
	Matches on Primary IDs.

******************************************************************************/
set more off
*Matching procedure, run separately for every assignment grade and year:
*  1. look up the primary name and dob of each studentno, then
*  2. use that name and dob to look for the student in the NSC data.

*K0 is included in the loop so that students who applied in both K0 and K1 can
*be reduced to their first application. That happens in the final step of this
*file, where only the first application per studentno is kept.
local grades "K0 K1"
local years "1997 1998 1999 2000 2001 2002 2003"
foreach g of local grades {
foreach y of local years {
	*Progress output: year and grade currently being processed
	di "`y'"
	di "`g'"
	*Load in assignment data for the year
	use "$stata_data_assignment/`y'/pscores`g'.dta", clear

	*Merge in an indicator for if a student is properly matched
	merge 1:1 studentno using "$stata_data_assignment/`y'/correct_match`g'.dta", keep(master match) nogen

	*Find the primary name, dob, year from enrollment data
	merge 1:1 studentno using "$stata_data/primary_sample.dta", keep(match master)


	*Track attrition: find_prim_name is 1 when the studentno was found in the
	*primary sample and 0 otherwise
	gen find_prim_name = _merge == 3
	drop _merge

	*Track the year and grade of assignment
	gen yr_asgn = `y'
	gen gr_asgn = "`g'"

	*Deal with observations without names: give each one a unique placeholder
	*first name (its row number) so that nameless rows are not tagged as
	*duplicates of each other below.
	replace fname = string(_n) if fname == "" & lname == ""

	*Drop every observation whose combination of fname lname mi dob appears more
	*than once. Note from the original author: the random number of such a
	*student is the minimum over both observations, since the number for n
	*duplicate names is the first order statistic of n uniform draws.
	duplicates tag fname lname mi dob, gen(dup_name)
	drop if dup_name > 0

	*Merge to NSC using name and dob
	preserve
		*Keep only students with a primary name, then merge on the NSC using this name
		drop if find_prim_name == 0
		merge 1:1 fname lname mi dob using "$stata_data_nsc/preknscclean_names", keep(match master)

		*nsc_prim_name_link is 1 when the name and dob were found in the NSC file
		gen nsc_prim_name_link = _merge == 3
		drop _merge

		*Save the NSC-linked subsample so it can be merged back by studentno
		tempfile full_samp
		save `full_samp', replace
	restore

	*Merge back in with original dataset. Rows dropped inside the preserve block
	*stay in the data and get missing values for the NSC variables.
	merge 1:1 studentno using `full_samp'
	drop _merge

	*Prepare the location key: the assignment geocode becomes a numeric geocode
	rename asngeo geocode
	destring geocode, replace

	*Load in the location marker of each student
	merge m:1 geocode using "$stata_data_location/student_to_loc.dta", gen(_mgeo) keep(master matched)

	*Load in block level information from census
	merge m:1 geocode using "$stata_data_location/geo_data.dta", gen(_mtracts) keep(master matched)

	*Clean tract data: left-pad the tract code with zeros to 6 characters and
	*prefix it with 25025.
	*NOTE(review): 25025 looks like a state plus county FIPS prefix - confirm.
	rename tractce10 tract
	tostring tract, replace
	gen len = length(tract)
	*Pad every code of length 1 to 5. The earlier version only padded lengths
	*5, 3 and 1, so a code of length 4 or 2 would have produced a malformed id.
	replace tract = substr("00000", 1, 6 - len) + tract if inrange(len, 1, 5)
	gen newlen = length(tract)
	replace tract = "25025" + tract
	destring tract, replace

	*Load in enrollment data from the BPS extract
	merge 1:1 fname lname mi dob studentno year using "$stata_data/cleaned_extract.dta", keep(master match)
	drop _merge

	*Clean schools data
	* Create endogenous variable (preK attendance) for project
	gen att = !inlist(sch,"4840","") // 4840 is the school code for not attending preK
	merge m:1 sch using "$stata_data/school_coding.dta", keep(match master) nogen
	*One indicator variable per school code observed in this grade and year
	levelsof sch
	foreach sch in `r(levels)' {
		gen sch_`sch' = sch == "`sch'"
	}

	*Determine preK number: the last character of grade, as a number
	gen knum = substr(grade,-1,1)
	destring knum, replace

	*Clean some of the variables for easier analysis. Indicators with an if
	*condition are left missing when their source variable is missing.
	gen black = inlist(race,1,7) if race != .
	gen white = race==2 if race != .
	gen hispanic = inlist(race,4,5) if race != .
	gen asian = race == 3 if race != .
	gen female = gender=="F" if gender != ""
	*Condition on homelang1 itself; the earlier version conditioned on gender
	gen ehomlang = homelang1=="E" if homelang1 != ""
	gen efirst = firstlang=="E" if firstlang != ""
	gen is_biling = bilingual != ""
	gen sp_biling = bilingual == "S"
	gen frpl = inlist(food, "AF", "AR", "DC", "FR", "RF", "TA")

	*Deal with twin analysis. If twins applied to the same PreK program at the same time
	*they share the best lottery draw of both twins. Due to this, twin observations
	*are flagged here so that they can be deleted later.
	*Generate a sibling id concept based on the name of the parents. The key here
	*is to restrict to the same year and same grade a student is applying to.
	*NOTE(review): the parts are concatenated without a separator, so two different
	*parent name pairs could in principle produce the same string.
	gen sibling_str = parentalast + parentafirst + grade+ string(yr_asgn)
	encode sibling_str, gen(sibling_id)
	drop sibling_str

	*get the number of siblings per family (1 means no sibling in this grade and year)
	duplicates tag parentalast parentafirst grade yr_asgn, gen(num_sibling)
	replace num_sibling = num_sibling + 1

	*Check if the siblings have the same random numbers. rand_same is also 1 for
	*a student without siblings, since max and min then coincide.
	bysort sibling_id: egen max_rand = max(random)
	bysort sibling_id: egen min_rand = min(random)
	gen rand_same = abs(max_rand- min_rand) == 0
	drop max_rand min_rand

	*General check for if a random number is double pulled in the same year
	duplicates tag random, gen(dup_random)

	*Get first initials for future match with SIMS
	gen fl = lower(substr(fname,1,1))
	gen ll = lower(substr(lname,1,1))


	*Save dataset for analysis
	save "$stata_data_analysis/nsc/`y'/nsc_ready_`g'.dta", replace

}
}

*Stack the yearly files: first build one file per grade out of all years
foreach gr of local grades {
	clear
	foreach yr of local years {
		append using "$stata_data_analysis/nsc/`yr'/nsc_ready_`gr'.dta"
	}
	save "$stata_data_analysis/nsc/nsc_ready_`gr'.dta", replace
}

*Then stack the per-grade files into a single dataset covering every grade
clear
foreach gr of local grades {
	append using "$stata_data_analysis/nsc/nsc_ready_`gr'.dta"
}

*Full stacked sample, before any de-duplication
save "$stata_data_analysis/nsc/nsc_ready.dta", replace

*Flag studentnos that appear more than once in the stacked data. dup_no is
*computed before the drops below, so it describes the stacked sample.
duplicates tag studentno, gen(dup_no)

*Get rid of sibling repeat data: drop members of a sibling group with more than
*one member when the whole group shares a single random number
drop if rand_same == 1 & num_sibling > 1


*Only keep the first application per student number. gr_asgn is a secondary
*sort key so that the kept row is reproducible when a studentno has more than
*one application in the same yr_asgn; sorting on yr_asgn alone leaves the order
*of such ties arbitrary from run to run.
bysort studentno (yr_asgn gr_asgn) : keep if _n == 1

*Lower-case the names and initials
local var_names fname lname fl ll
foreach var of local var_names {
	replace `var' = lower(`var')
}

save "$stata_data_analysis/nsc/nsc_ready_fin.dta", replace
